# Fix outliers
outlierKD2 <- function(df, var, rm=FALSE) { 
    #' Original outlierKD functino by By Klodian Dhana,
    #' https://www.r-bloggers.com/identify-describe-plot-and-remove-the-outliers-from-the-dataset/
    #' Modified to have third argument for removing outliers instead of interactive prompt, 
    #' and after removing outlier, original df will not be changed. The function returns the a df, 
    #' which can be saved as original df name if desired.
    #' Also added QQ-plot in the output.
    #' Check outliers, and option to remove them, save as a new dataframe. 
    #' @param df The dataframe.
    #' @param var The variable in the dataframe to be checked for outliers
    #' @param rm Boolean. Whether to remove outliers or not.
    #' @return The dataframe with outliers replaced by NA if rm==TRUE, or df if nothing changed
    #' @examples
    #' outlierKD2(mydf, height, FALSE)
    #' mydf = outlierKD2(mydf, height, TRUE)
    #' mydfnew = outlierKD2(mydf, height, TRUE)
    dt = df # duplicate the dataframe for potential alteration
    var_name <- eval(substitute(var),eval(dt))
    na1 <- sum(is.na(var_name))
    m1 <- mean(var_name, na.rm = T)
    par(mfrow=c(2, 2), oma=c(0,0,3,0))
    boxplot(var_name, main="With outliers")
    hist(var_name, main="With outliers", xlab=NA, ylab=NA)
    qqnorm(var_name, main = "With outliers")
    qqline(var_name)
    outlier <- boxplot.stats(var_name)$out
    mo <- mean(outlier)
    var_name <- ifelse(var_name %in% outlier, NA, var_name)
    boxplot(var_name, main="Without outliers")
    hist(var_name, main="Without outliers", xlab=NA, ylab=NA)
    qqnorm(var_name, main = "Without outliers")
    qqline(var_name)
    title("Outlier Check", outer=TRUE)
    na2 <- sum(is.na(var_name))
    cat("Outliers identified:", na2 - na1, "\n")
    cat("Propotion (%) of outliers:", round((na2 - na1) / sum(!is.na(var_name))*100, 1), "\n")
    cat("Mean of the outliers:", round(mo, 2), "\n")
    m2 <- mean(var_name, na.rm = T)
    cat("Mean without removing outliers:", round(m1, 2), "\n")
    cat("Mean if we remove outliers:", round(m2, 2), "\n")
    
    # response <- readline(prompt="Do you want to remove outliers and to replace with NA? [yes/no]: ")
    # if(response == "y" | response == "yes"){
    if(rm){
        dt[as.character(substitute(var))] <- invisible(var_name)
        #assign(as.character(as.list(match.call())$dt), dt, envir = .GlobalEnv)
        cat("Outliers successfully removed", "\n")
        return(invisible(dt))
    } else {
        cat("Nothing changed", "\n")
        return(invisible(df))
    }
}
loadPkg("xtable")
loadPkg("kableExtra")
loadPkg("stringi")


xkabledply = function(modelsmmrytable, title="Table", digits = 4, pos="left", bso="striped", wide=FALSE) { 
  #' Combining base::summary, xtable, and kableExtra, to easily display model summary. 
  #' wrapper for the base::summary function on model objects
  #' ELo 202004 GWU DATS
  #' version 1.2
  #' @param modelsmmrytable This can be a generic table, a model object such as lm(), or the summary of a model object summary(lm()) 
  #' @param title Title of table. 
  #' @param digits Number of digits to display
  #' @param pos Position of table, c("left","center","right") 
  #' @param bso bootstrap_options = c("basic", "striped", "bordered", "hover", "condensed", "responsive")
  #' @return HTML table for display
  #' @examples
  #' library("xtable")
  #' library("kableExtra")
  #' xkabledply( df, title="Table testing", pos="left", bso="hover" )
  if (wide) { modelsmmrytable <- t(modelsmmrytable) }
  modelsmmrytable %>%
    xtable() %>% 
    kable(caption = title, digits = digits) %>%
    kable_styling(bootstrap_options = bso, full_width = FALSE, position = pos)
}

xkablesummary = function(df, title="Table: Statistics summary.", digits = 4, pos="left", bso="striped") { 
  #' Combining base::summary, xtable, and kableExtra, to easily display numeric variable summary of dataframes. 
  #' ELo 202004 GWU DATS
  #' version 1.2
  #' @param df The dataframe.
  #' @param title Title of table. 
  #' @param digits Number of digits to display
  #' @param pos Position of table, c("left","center","right") 
  #' @param bso bootstrap_options = c("basic", "striped", "bordered", "hover", "condensed", "responsive")
  #' @return The HTML summary table for display, or for knitr to process into other formats 
  #' @examples
  #' xkablesummary( faraway::ozone )
  #' xkablesummary( ISLR::Hitters, title="Five number summary", pos="left", bso="hover"  )
  
  s = summary(df) %>%
    apply( 2, function(x) stringr::str_remove_all(x,c("Min.\\s*:\\s*","1st Qu.\\s*:\\s*","Median\\s*:\\s*","Mean\\s*:\\s*","3rd Qu.\\s*:\\s*","Max.\\s*:\\s*")) ) %>% # replace all leading words
    apply( 2, function(x) stringr::str_trim(x, "right")) # trim trailing spaces left
  
  colnames(s) <- stringr::str_trim(colnames(s))
  
  if ( dim(s)[1] ==6 ) { rownames(s) <- c('Min','Q1','Median','Mean','Q3','Max') 
  } else if ( dim(s)[1] ==7 ) { rownames(s) <- c('Min','Q1','Median','Mean','Q3','Max','NA') }
  
  xkabledply(s, title=title, digits = digits, pos=pos, bso=bso )
}

xkablevif = function(model, title="VIFs of the model", digits = 3, pos="left", bso="striped", wide=FALSE) { 
  #' Combining faraway::vif, xtable, and kableExtra, to easily display numeric summary of VIFs for a model. 
  #' ELo 202004 GWU DATS
  #' version 1.2
  #' @param model The lm or compatible model object.
  #' @param title Title of table. 
  #' @param digits Number of digits to display
  #' @param pos Position of table, c("left","center","right") 
  #' @param bso bootstrap_options = c("basic", "striped", "bordered", "hover", "condensed", "responsive")
  #' @param wide print table in long (FALSE) format or wide (TRUE) format
  #' @return The HTML summary table of the VIFs for a model for display, or for knitr to process into other formats 
  #' @examples
  #' xkablevif( lm(Salary~Hits+RBI, data=ISLR::Hitters, wide=T ) )
  
  vifs = table( names(model$coefficients)[2:length(model$coefficients)] ) # remove intercept to set column names
  vifs[] = faraway::vif(model) # set the values
  if (wide) { vifs <- t(vifs) }
  xkabledply( vifs, title=title, digits = digits, pos=pos, bso=bso )
}

Introduction

Wine’s quality grade is affected by mant different factors. To better understand which factor could influence wine’s quality, we sttarted this project. The main objective of our study is to see whether these is an association between The importance of this subject arises from a reccurring topic that we see in the relationship between wine’s quality grade and wine’s description. This topic is brought up often but the question stands whether there is enough evidence to prove that wine’s quality grade and wine’s description is the same.

First, we import the data set wine.csv as wine, and check the structure. And then make are stored as factors, instead of numeric or integers.

wine<-data.frame(read.csv("wine.csv"))
str(wine)
## 'data.frame':    4898 obs. of  12 variables:
##  $ fixed.acidity       : num  7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
##  $ volatile.acidity    : num  0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
##  $ citric.acid         : num  0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
##  $ residual.sugar      : num  20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
##  $ chlorides           : num  0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
##  $ free.sulfur.dioxide : num  45 14 30 47 47 30 30 45 14 28 ...
##  $ total.sulfur.dioxide: num  170 132 97 186 186 97 136 170 132 129 ...
##  $ density             : num  1.001 0.994 0.995 0.996 0.996 ...
##  $ pH                  : num  3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
##  $ sulphates           : num  0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
##  $ alcohol             : num  8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
##  $ quality             : int  6 6 6 6 6 6 6 6 6 6 ...
length(wine$quality)
## [1] 4898
wine$quality=as.factor(wine$quality)
str(wine)
## 'data.frame':    4898 obs. of  12 variables:
##  $ fixed.acidity       : num  7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
##  $ volatile.acidity    : num  0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
##  $ citric.acid         : num  0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
##  $ residual.sugar      : num  20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
##  $ chlorides           : num  0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
##  $ free.sulfur.dioxide : num  45 14 30 47 47 30 30 45 14 28 ...
##  $ total.sulfur.dioxide: num  170 132 97 186 186 97 136 170 132 129 ...
##  $ density             : num  1.001 0.994 0.995 0.996 0.996 ...
##  $ pH                  : num  3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
##  $ sulphates           : num  0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
##  $ alcohol             : num  8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
##  $ quality             : Factor w/ 7 levels "3","4","5","6",..: 4 4 4 4 4 4 4 4 4 4 ...
xkablesummary(wine)
Table: Statistics summary.
fixed.acidity volatile.acidity citric.acid residual.sugar chlorides free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol quality
Min 3.80 0.080 0.000 0.6 0.009 2.0 9 0.987 2.72 0.22 8.0 3: 20
Q1 6.30 0.210 0.270 1.7 0.036 23.0 108 0.992 3.09 0.41 9.5 4: 163
Median 6.80 0.260 0.320 5.2 0.043 34.0 134 0.994 3.18 0.47 10.4 5:1457
Mean 6.85 0.278 0.334 6.4 0.046 35.3 138 0.994 3.19 0.49 10.5 6:2198
Q3 7.30 0.320 0.390 9.9 0.050 46.0 167 0.996 3.28 0.55 11.4 7: 880
Max 14.20 1.100 1.660 65.8 0.346 289.0 440 1.039 3.82 1.08 14.2 8: 175
NA NA NA NA NA NA NA NA NA NA NA NA 9: 5

Then, we want to remove the outliers. Firstly, we want to do the boxplot to see which variables have outliers, then we use outlierKD2 to remove the outliers.

qqnorm(wine$fixed.acidity, main="qq plot of wine's fixed acidity")

qqnorm(wine$volatile.acidity,main="qqplot of wine's volatile.acidity")

qqnorm(wine$citric.acid,main="qqplot of wine's citric.acid")

qqnorm(wine$residual.sugar,main="qqplot of wine's residual.sugar")

qqnorm(wine$chlorides,main="qqplot of wine's chlorides")

qqnorm(wine$free.sulfur.dioxide,main="qqplot of wine's free.sulfur.dioxide")

qqnorm(wine$total.sulfur.dioxide,main="qqplot of wine's total.sulfur.dioxide")

qqnorm(wine$density,main="qqplot of wine's density")

qqnorm(wine$pH,main="qqplot of wine's pH",xlab="pH")

qqnorm(wine$sulphates,main="qqplot of wine's sulphates")

qqnorm(wine$alcohol,main="qqplot of wine's alcohol")

After the qqplot, we could see that all these variables have outliers, so we use outlierKD2 function to remove the outliers.

wine<-outlierKD2(wine,fixed.acidity,TRUE)

## Outliers identified: 119 
## Propotion (%) of outliers: 2.5 
## Mean of the outliers: 8.81 
## Mean without removing outliers: 6.85 
## Mean if we remove outliers: 6.81 
## Outliers successfully removed
wine<-outlierKD2(wine,volatile.acidity,TRUE)

## Outliers identified: 186 
## Propotion (%) of outliers: 3.9 
## Mean of the outliers: 0.59 
## Mean without removing outliers: 0.28 
## Mean if we remove outliers: 0.27 
## Outliers successfully removed
wine<-outlierKD2(wine,citric.acid,TRUE)

## Outliers identified: 270 
## Propotion (%) of outliers: 5.8 
## Mean of the outliers: 0.49 
## Mean without removing outliers: 0.33 
## Mean if we remove outliers: 0.33 
## Outliers successfully removed
wine<-outlierKD2(wine,residual.sugar,TRUE)

## Outliers identified: 7 
## Propotion (%) of outliers: 0.1 
## Mean of the outliers: 32.5 
## Mean without removing outliers: 6.39 
## Mean if we remove outliers: 6.35 
## Outliers successfully removed
wine<-outlierKD2(wine,chlorides,TRUE)

## Outliers identified: 208 
## Propotion (%) of outliers: 4.4 
## Mean of the outliers: 0.12 
## Mean without removing outliers: 0.05 
## Mean if we remove outliers: 0.04 
## Outliers successfully removed
wine<-outlierKD2(wine,free.sulfur.dioxide,TRUE)

## Outliers identified: 50 
## Propotion (%) of outliers: 1 
## Mean of the outliers: 101 
## Mean without removing outliers: 35.3 
## Mean if we remove outliers: 34.6 
## Outliers successfully removed
wine<-outlierKD2(wine,total.sulfur.dioxide,TRUE)

## Outliers identified: 19 
## Propotion (%) of outliers: 0.4 
## Mean of the outliers: 226 
## Mean without removing outliers: 138 
## Mean if we remove outliers: 138 
## Outliers successfully removed
wine<-outlierKD2(wine,density,TRUE)

## Outliers identified: 5 
## Propotion (%) of outliers: 0.1 
## Mean of the outliers: 1.01 
## Mean without removing outliers: 0.99 
## Mean if we remove outliers: 0.99 
## Outliers successfully removed
wine<-outlierKD2(wine,pH,TRUE)

## Outliers identified: 75 
## Propotion (%) of outliers: 1.6 
## Mean of the outliers: 3.55 
## Mean without removing outliers: 3.19 
## Mean if we remove outliers: 3.18 
## Outliers successfully removed
wine<-outlierKD2(wine,sulphates,TRUE)

## Outliers identified: 124 
## Propotion (%) of outliers: 2.6 
## Mean of the outliers: 0.84 
## Mean without removing outliers: 0.49 
## Mean if we remove outliers: 0.48 
## Outliers successfully removed
wine<-outlierKD2(wine,alcohol,TRUE)

## Outliers identified: 0 
## Propotion (%) of outliers: 0 
## Mean of the outliers: NaN 
## Mean without removing outliers: 10.5 
## Mean if we remove outliers: 10.5 
## Outliers successfully removed